{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "checkpoint\t\t\t\tmodel.ckpt-1000000.index\r\n",
      "model.ckpt-1000000.data-00000-of-00002\tmodel.ckpt-1000000.meta\r\n",
      "model.ckpt-1000000.data-00001-of-00002\toperative_config.gin\r\n"
     ]
    }
   ],
   "source": [
    "# !wget https://f000.backblazeb2.com/file/malaya-model/pretrained/t5-3x-super-tiny-2021-07-28.tar.gz\n",
    "# !tar -zxf t5-3x-super-tiny-2021-07-28.tar.gz\n",
    "# !rm t5-3x-super-tiny-2021-07-28.tar.gz\n",
    "!ls t5-3x-super-tiny-v3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "os.environ['CUDA_VISIBLE_DEVICES'] = ''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import T5Config, T5Model, load_tf_weights_in_t5"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "out = 't5-3x-super-tiny'\n",
    "os.makedirs(out, exist_ok=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "import mesh_tensorflow.optimize\r\n",
      "import mesh_tensorflow.transformer.dataset\r\n",
      "import mesh_tensorflow.transformer.learning_rate_schedules\r\n",
      "import mesh_tensorflow.transformer.t2t_vocabulary\r\n",
      "import mesh_tensorflow.transformer.transformer_layers\r\n",
      "import mesh_tensorflow.transformer.utils\r\n",
      "import t5.data.sentencepiece_vocabulary\r\n",
      "import t5.models.mesh_transformer\r\n",
      "\r\n",
      "# Macros:\r\n",
      "# ==============================================================================\r\n",
      "d_ff = 256\r\n",
      "d_kv = 64\r\n",
      "d_model = 64\r\n",
      "dropout_rate = 0.0\r\n",
      "init_checkpoint = \\\r\n",
      "    'gs://t5-data/pretrained_models/super-super-tiny/model.ckpt-1000000'\r\n",
      "inputs_length = 512\r\n",
      "mean_noise_span_length = 3.0\r\n",
      "MIXTURE_NAME = 'all_mix'\r\n",
      "noise_density = 0.15\r\n",
      "num_heads = 6\r\n",
      "num_layers = 1\r\n",
      "\r\n",
      "# Parameters for AdafactorOptimizer:\r\n",
      "# ==============================================================================\r\n",
      "AdafactorOptimizer.beta1 = 0.0\r\n",
      "AdafactorOptimizer.clipping_threshold = 1.0\r\n",
      "AdafactorOptimizer.decay_rate = None\r\n",
      "AdafactorOptimizer.epsilon1 = 1e-30\r\n",
      "AdafactorOptimizer.epsilon2 = 0.001\r\n",
      "AdafactorOptimizer.factored = True\r\n",
      "AdafactorOptimizer.min_dim_size_to_factor = 128\r\n",
      "AdafactorOptimizer.multiply_by_parameter_scale = True\r\n",
      "\r\n",
      "# Parameters for Bitransformer:\r\n",
      "# ==============================================================================\r\n",
      "Bitransformer.shared_embedding = True\r\n",
      "\r\n",
      "# Parameters for denoise:\r\n",
      "# ==============================================================================\r\n",
      "denoise.inputs_fn = @preprocessors.noise_span_to_unique_sentinel\r\n",
      "denoise.noise_density = %noise_density\r\n",
      "denoise.noise_mask_fn = @preprocessors.random_spans_noise_mask\r\n",
      "denoise.targets_fn = @preprocessors.nonnoise_span_to_unique_sentinel\r\n",
      "\r\n",
      "# Parameters for decoder/DenseReluDense:\r\n",
      "# ==============================================================================\r\n",
      "decoder/DenseReluDense.activation = 'relu'\r\n",
      "decoder/DenseReluDense.dropout_rate = %dropout_rate\r\n",
      "decoder/DenseReluDense.hidden_size = %d_ff\r\n",
      "\r\n",
      "# Parameters for encoder/DenseReluDense:\r\n",
      "# ==============================================================================\r\n",
      "encoder/DenseReluDense.activation = 'relu'\r\n",
      "encoder/DenseReluDense.dropout_rate = %dropout_rate\r\n",
      "encoder/DenseReluDense.hidden_size = %d_ff\r\n",
      "\r\n",
      "# Parameters for decoder/EncDecAttention:\r\n",
      "# ==============================================================================\r\n",
      "# None.\r\n",
      "\r\n",
      "# Parameters for get_variable_dtype:\r\n",
      "# ==============================================================================\r\n",
      "get_variable_dtype.activation_dtype = 'bfloat16'\r\n",
      "\r\n",
      "# Parameters for get_vocab_embedding_cls:\r\n",
      "# ==============================================================================\r\n",
      "# None.\r\n",
      "\r\n",
      "# Parameters for decoder/LayerStack:\r\n",
      "# ==============================================================================\r\n",
      "decoder/LayerStack.dropout_rate = %dropout_rate\r\n",
      "decoder/LayerStack.norm_epsilon = 1e-06\r\n",
      "decoder/LayerStack.recompute_grads = False\r\n",
      "\r\n",
      "# Parameters for encoder/LayerStack:\r\n",
      "# ==============================================================================\r\n",
      "encoder/LayerStack.dropout_rate = %dropout_rate\r\n",
      "encoder/LayerStack.norm_epsilon = 1e-06\r\n",
      "encoder/LayerStack.recompute_grads = False\r\n",
      "\r\n",
      "# Parameters for make_bitransformer:\r\n",
      "# ==============================================================================\r\n",
      "make_bitransformer.decoder_name = 'decoder'\r\n",
      "make_bitransformer.encoder_name = 'encoder'\r\n",
      "\r\n",
      "# Parameters for decoder/make_layer_stack:\r\n",
      "# ==============================================================================\r\n",
      "decoder/make_layer_stack.block_scope = True\r\n",
      "decoder/make_layer_stack.layers = \\\r\n",
      "    [@mesh_tensorflow.transformer.transformer_layers.SelfAttention,\r\n",
      "     @mesh_tensorflow.transformer.transformer_layers.EncDecAttention,\r\n",
      "     @mesh_tensorflow.transformer.transformer_layers.DenseReluDense]\r\n",
      "decoder/make_layer_stack.num_layers = %num_layers\r\n",
      "\r\n",
      "# Parameters for encoder/make_layer_stack:\r\n",
      "# ==============================================================================\r\n",
      "encoder/make_layer_stack.block_scope = True\r\n",
      "encoder/make_layer_stack.layers = \\\r\n",
      "    [@mesh_tensorflow.transformer.transformer_layers.SelfAttention,\r\n",
      "     @mesh_tensorflow.transformer.transformer_layers.DenseReluDense]\r\n",
      "encoder/make_layer_stack.num_layers = %num_layers\r\n",
      "\r\n",
      "# Parameters for mesh_train_dataset_fn:\r\n",
      "# ==============================================================================\r\n",
      "mesh_train_dataset_fn.use_cached = False\r\n",
      "\r\n",
      "# Parameters for MtfModel:\r\n",
      "# ==============================================================================\r\n",
      "MtfModel.autostack = True\r\n",
      "MtfModel.ensemble_inputs = None\r\n",
      "MtfModel.gcp_project = None\r\n",
      "MtfModel.layout_rules = \\\r\n",
      "    'ensemble:ensemble,batch:batch,d_ff:model,heads:model,vocab:model,experts:batch'\r\n",
      "MtfModel.mesh_devices = None\r\n",
      "MtfModel.mesh_shape = None\r\n",
      "MtfModel.model_type = 'bitransformer'\r\n",
      "MtfModel.optimizer = None\r\n",
      "MtfModel.predict_fn = None\r\n",
      "MtfModel.tpu_job_name = None\r\n",
      "MtfModel.tpu_zone = None\r\n",
      "MtfModel.variable_filter = None\r\n",
      "\r\n",
      "# Parameters for noise_span_to_unique_sentinel:\r\n",
      "# ==============================================================================\r\n",
      "# None.\r\n",
      "\r\n",
      "# Parameters for nonnoise_span_to_unique_sentinel:\r\n",
      "# ==============================================================================\r\n",
      "# None.\r\n",
      "\r\n",
      "# Parameters for num_parallel_calls:\r\n",
      "# ==============================================================================\r\n",
      "num_parallel_calls.deterministic = False\r\n",
      "\r\n",
      "# Parameters for pack_dataset:\r\n",
      "# ==============================================================================\r\n",
      "pack_dataset.use_custom_ops = False\r\n",
      "\r\n",
      "# Parameters for pack_or_pad:\r\n",
      "# ==============================================================================\r\n",
      "# None.\r\n",
      "\r\n",
      "# Parameters for random_spans_helper:\r\n",
      "# ==============================================================================\r\n",
      "random_spans_helper.extra_tokens_per_span_inputs = 1\r\n",
      "random_spans_helper.extra_tokens_per_span_targets = 1\r\n",
      "random_spans_helper.inputs_length = %inputs_length\r\n",
      "random_spans_helper.mean_noise_span_length = %mean_noise_span_length\r\n",
      "random_spans_helper.noise_density = %noise_density\r\n",
      "\r\n",
      "# Parameters for random_spans_noise_mask:\r\n",
      "# ==============================================================================\r\n",
      "random_spans_noise_mask.mean_noise_span_length = %mean_noise_span_length\r\n",
      "\r\n",
      "# Parameters for random_spans_tokens_length:\r\n",
      "# ==============================================================================\r\n",
      "# None.\r\n",
      "\r\n",
      "# Parameters for reduce_concat_tokens:\r\n",
      "# ==============================================================================\r\n",
      "reduce_concat_tokens.batch_size = 128\r\n",
      "reduce_concat_tokens.feature_key = 'targets'\r\n",
      "\r\n",
      "# Parameters for select_random_chunk:\r\n",
      "# ==============================================================================\r\n",
      "select_random_chunk.feature_key = 'targets'\r\n",
      "select_random_chunk.max_length = 65536\r\n",
      "\r\n",
      "# Parameters for decoder/SelfAttention:\r\n",
      "# ==============================================================================\r\n",
      "decoder/SelfAttention.attention_func = None\r\n",
      "decoder/SelfAttention.attention_kwargs = None\r\n",
      "decoder/SelfAttention.combine_dims = True\r\n",
      "decoder/SelfAttention.dropout_rate = %dropout_rate\r\n",
      "decoder/SelfAttention.keep_query_heads_dims = False\r\n",
      "decoder/SelfAttention.key_value_size = %d_kv\r\n",
      "decoder/SelfAttention.num_heads = %num_heads\r\n",
      "decoder/SelfAttention.num_memory_heads = 0\r\n",
      "decoder/SelfAttention.relative_attention_num_buckets = 32\r\n",
      "decoder/SelfAttention.relative_attention_type = 'bias_shared'\r\n",
      "decoder/SelfAttention.shared_kv = False\r\n",
      "\r\n",
      "# Parameters for encoder/SelfAttention:\r\n",
      "# ==============================================================================\r\n",
      "encoder/SelfAttention.attention_func = None\r\n",
      "encoder/SelfAttention.attention_kwargs = None\r\n",
      "encoder/SelfAttention.combine_dims = True\r\n",
      "encoder/SelfAttention.dropout_rate = %dropout_rate\r\n",
      "encoder/SelfAttention.keep_query_heads_dims = False\r\n",
      "encoder/SelfAttention.key_value_size = %d_kv\r\n",
      "encoder/SelfAttention.num_heads = %num_heads\r\n",
      "encoder/SelfAttention.num_memory_heads = 0\r\n",
      "encoder/SelfAttention.relative_attention_num_buckets = 32\r\n",
      "encoder/SelfAttention.relative_attention_type = 'bias_shared'\r\n",
      "encoder/SelfAttention.shared_kv = False\r\n",
      "\r\n",
      "# Parameters for SentencePieceVocabulary:\r\n",
      "# ==============================================================================\r\n",
      "SentencePieceVocabulary.extra_ids = 100\r\n",
      "\r\n",
      "# Parameters for serialize_num_microbatches:\r\n",
      "# ==============================================================================\r\n",
      "serialize_num_microbatches.tokens_per_microbatch_per_replica = 8192\r\n",
      "\r\n",
      "# Parameters for shift_targets:\r\n",
      "# ==============================================================================\r\n",
      "shift_targets.bos_id = 0\r\n",
      "shift_targets.eos_id = 1\r\n",
      "\r\n",
      "# Parameters for split_tokens:\r\n",
      "# ==============================================================================\r\n",
      "split_tokens.feature_key = 'targets'\r\n",
      "split_tokens.max_tokens_per_segment = @preprocessors.random_spans_tokens_length()\r\n",
      "split_tokens.min_tokens_per_segment = None\r\n",
      "\r\n",
      "# Parameters for tpu_estimator_model_fn:\r\n",
      "# ==============================================================================\r\n",
      "tpu_estimator_model_fn.model_info_file = None\r\n",
      "tpu_estimator_model_fn.outer_batch_size = 1\r\n",
      "tpu_estimator_model_fn.tpu_summaries = False\r\n",
      "\r\n",
      "# Parameters for tpu_mesh_shape:\r\n",
      "# ==============================================================================\r\n",
      "tpu_mesh_shape.ensemble_parallelism = None\r\n",
      "\r\n",
      "# Parameters for decoder/Unitransformer:\r\n",
      "# ==============================================================================\r\n",
      "decoder/Unitransformer.d_model = %d_model\r\n",
      "decoder/Unitransformer.ensemble = None\r\n",
      "decoder/Unitransformer.input_full_attention = False\r\n",
      "decoder/Unitransformer.label_smoothing = 0.0\r\n",
      "decoder/Unitransformer.loss_denominator = 233472\r\n",
      "decoder/Unitransformer.loss_fn = None\r\n",
      "decoder/Unitransformer.loss_on_targets_only = False\r\n",
      "decoder/Unitransformer.max_length = 512\r\n",
      "decoder/Unitransformer.positional_embedding = False\r\n",
      "decoder/Unitransformer.shared_embedding_and_softmax_weights = True\r\n",
      "decoder/Unitransformer.sinusoid_positional_embedding = False\r\n",
      "decoder/Unitransformer.token_dropout_rate = 0.0\r\n",
      "decoder/Unitransformer.vocab_divisor = 128\r\n",
      "decoder/Unitransformer.z_loss = 0.0001\r\n",
      "\r\n",
      "# Parameters for encoder/Unitransformer:\r\n",
      "# ==============================================================================\r\n",
      "encoder/Unitransformer.d_model = %d_model\r\n",
      "encoder/Unitransformer.ensemble = None\r\n",
      "encoder/Unitransformer.input_full_attention = False\r\n",
      "encoder/Unitransformer.label_smoothing = 0.0\r\n",
      "encoder/Unitransformer.loss_denominator = None\r\n",
      "encoder/Unitransformer.loss_fn = None\r\n",
      "encoder/Unitransformer.loss_on_targets_only = False\r\n",
      "encoder/Unitransformer.max_length = 512\r\n",
      "encoder/Unitransformer.positional_embedding = False\r\n",
      "encoder/Unitransformer.shared_embedding_and_softmax_weights = True\r\n",
      "encoder/Unitransformer.sinusoid_positional_embedding = False\r\n",
      "encoder/Unitransformer.token_dropout_rate = 0.0\r\n",
      "encoder/Unitransformer.vocab_divisor = 128\r\n",
      "encoder/Unitransformer.z_loss = 0.0001\r\n",
      "\r\n",
      "# Parameters for unsupervised:\r\n",
      "# ==============================================================================\r\n",
      "unsupervised.preprocessors = \\\r\n",
      "    [@preprocessors.select_random_chunk,\r\n",
      "     @preprocessors.reduce_concat_tokens,\r\n",
      "     @preprocessors.split_tokens,\r\n",
      "     @preprocessors.denoise]\r\n",
      "\r\n",
      "# Parameters for VarianceScalingInitializer:\r\n",
      "# ==============================================================================\r\n",
      "VarianceScalingInitializer.distribution = 'normal'\r\n",
      "VarianceScalingInitializer.mode = 'fan_in'\r\n",
      "VarianceScalingInitializer.scale = 1.0\r\n",
      "\r\n",
      "# Parameters for VocabEmbedding:\r\n",
      "# ==============================================================================\r\n",
      "# None.\r\n"
     ]
    }
   ],
   "source": [
    "!cat t5-3x-super-tiny-v3/operative_config.gin"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('decoder/block_000/layer_000/SelfAttention/k', [64, 384]),\n",
       " ('decoder/block_000/layer_000/SelfAttention/k_slot_v', [64, 384]),\n",
       " ('decoder/block_000/layer_000/SelfAttention/o', [384, 64]),\n",
       " ('decoder/block_000/layer_000/SelfAttention/o_slot_v', [384, 64]),\n",
       " ('decoder/block_000/layer_000/SelfAttention/q', [64, 384]),\n",
       " ('decoder/block_000/layer_000/SelfAttention/q_slot_v', [64, 384]),\n",
       " ('decoder/block_000/layer_000/SelfAttention/relative_attention_bias',\n",
       "  [6, 32]),\n",
       " ('decoder/block_000/layer_000/SelfAttention/relative_attention_bias_slot_v',\n",
       "  [6, 32]),\n",
       " ('decoder/block_000/layer_000/SelfAttention/v', [64, 384]),\n",
       " ('decoder/block_000/layer_000/SelfAttention/v_slot_v', [64, 384]),\n",
       " ('decoder/block_000/layer_000/layer_norm/scale', [64]),\n",
       " ('decoder/block_000/layer_000/layer_norm/scale_slot_v', [64]),\n",
       " ('decoder/block_000/layer_001/EncDecAttention/k', [64, 384]),\n",
       " ('decoder/block_000/layer_001/EncDecAttention/k_slot_v', [64, 384]),\n",
       " ('decoder/block_000/layer_001/EncDecAttention/o', [384, 64]),\n",
       " ('decoder/block_000/layer_001/EncDecAttention/o_slot_v', [384, 64]),\n",
       " ('decoder/block_000/layer_001/EncDecAttention/q', [64, 384]),\n",
       " ('decoder/block_000/layer_001/EncDecAttention/q_slot_v', [64, 384]),\n",
       " ('decoder/block_000/layer_001/EncDecAttention/v', [64, 384]),\n",
       " ('decoder/block_000/layer_001/EncDecAttention/v_slot_v', [64, 384]),\n",
       " ('decoder/block_000/layer_001/layer_norm/scale', [64]),\n",
       " ('decoder/block_000/layer_001/layer_norm/scale_slot_v', [64]),\n",
       " ('decoder/block_000/layer_002/DenseReluDense/wi/kernel', [64, 256]),\n",
       " ('decoder/block_000/layer_002/DenseReluDense/wi/kernel_slot_v', [64, 256]),\n",
       " ('decoder/block_000/layer_002/DenseReluDense/wo/kernel', [256, 64]),\n",
       " ('decoder/block_000/layer_002/DenseReluDense/wo/kernel_slot_v', [256, 64]),\n",
       " ('decoder/block_000/layer_002/layer_norm/scale', [64]),\n",
       " ('decoder/block_000/layer_002/layer_norm/scale_slot_v', [64]),\n",
       " ('decoder/final_layer_norm/scale', [64]),\n",
       " ('decoder/final_layer_norm/scale_slot_v', [64]),\n",
       " ('encoder/block_000/layer_000/SelfAttention/k', [64, 384]),\n",
       " ('encoder/block_000/layer_000/SelfAttention/k_slot_v', [64, 384]),\n",
       " ('encoder/block_000/layer_000/SelfAttention/o', [384, 64]),\n",
       " ('encoder/block_000/layer_000/SelfAttention/o_slot_v', [384, 64]),\n",
       " ('encoder/block_000/layer_000/SelfAttention/q', [64, 384]),\n",
       " ('encoder/block_000/layer_000/SelfAttention/q_slot_v', [64, 384]),\n",
       " ('encoder/block_000/layer_000/SelfAttention/relative_attention_bias',\n",
       "  [6, 32]),\n",
       " ('encoder/block_000/layer_000/SelfAttention/relative_attention_bias_slot_v',\n",
       "  [6, 32]),\n",
       " ('encoder/block_000/layer_000/SelfAttention/v', [64, 384]),\n",
       " ('encoder/block_000/layer_000/SelfAttention/v_slot_v', [64, 384]),\n",
       " ('encoder/block_000/layer_000/layer_norm/scale', [64]),\n",
       " ('encoder/block_000/layer_000/layer_norm/scale_slot_v', [64]),\n",
       " ('encoder/block_000/layer_001/DenseReluDense/wi/kernel', [64, 256]),\n",
       " ('encoder/block_000/layer_001/DenseReluDense/wi/kernel_slot_v', [64, 256]),\n",
       " ('encoder/block_000/layer_001/DenseReluDense/wo/kernel', [256, 64]),\n",
       " ('encoder/block_000/layer_001/DenseReluDense/wo/kernel_slot_v', [256, 64]),\n",
       " ('encoder/block_000/layer_001/layer_norm/scale', [64]),\n",
       " ('encoder/block_000/layer_001/layer_norm/scale_slot_v', [64]),\n",
       " ('encoder/final_layer_norm/scale', [64]),\n",
       " ('encoder/final_layer_norm/scale_slot_v', [64]),\n",
       " ('global_step', []),\n",
       " ('shared/embedding', [32128, 64]),\n",
       " ('shared/embedding_slot_v', [32128, 64])]"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import tensorflow as tf\n",
    "tf.train.list_variables('t5-3x-super-tiny-v3/model.ckpt-1000000')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "T5Config {\n",
      "  \"d_ff\": 256,\n",
      "  \"d_kv\": 64,\n",
      "  \"d_model\": 64,\n",
      "  \"decoder_start_token_id\": 0,\n",
      "  \"dense_act_fn\": \"relu\",\n",
      "  \"dropout_rate\": 0.1,\n",
      "  \"eos_token_id\": 1,\n",
      "  \"feed_forward_proj\": \"relu\",\n",
      "  \"initializer_factor\": 1.0,\n",
      "  \"inputs_length\": 1024,\n",
      "  \"is_encoder_decoder\": true,\n",
      "  \"is_gated_act\": false,\n",
      "  \"layer_norm_epsilon\": 1e-06,\n",
      "  \"model_type\": \"t5\",\n",
      "  \"n_positions\": 1024,\n",
      "  \"num_decoder_layers\": 1,\n",
      "  \"num_heads\": 6,\n",
      "  \"num_layers\": 1,\n",
      "  \"pad_token_id\": 0,\n",
      "  \"relative_attention_max_distance\": 128,\n",
      "  \"relative_attention_num_buckets\": 32,\n",
      "  \"transformers_version\": \"4.21.2\",\n",
      "  \"use_cache\": true,\n",
      "  \"vocab_size\": 32128\n",
      "}\n",
      "\n"
     ]
    }
   ],
   "source": [
    "config = T5Config(\n",
    "    vocab_size = 32128,\n",
    "    n_positions=1024,\n",
    "    d_ff = 256,\n",
    "    d_kv = 64,\n",
    "    d_model = 64,\n",
    "    dropout_rate = 0.1,\n",
    "    inputs_length = 512,\n",
    "    num_heads = 6,\n",
    "    num_layers = 1,\n",
    "    decoder_start_token_id = 0,\n",
    "    eos_token_id = 1,\n",
    "    pad_token_id = 0)\n",
    "print(config)\n",
    "config.save_pretrained(out)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "T5Model(\n",
       "  (shared): Embedding(32128, 64)\n",
       "  (encoder): T5Stack(\n",
       "    (embed_tokens): Embedding(32128, 64)\n",
       "    (block): ModuleList(\n",
       "      (0): T5Block(\n",
       "        (layer): ModuleList(\n",
       "          (0): T5LayerSelfAttention(\n",
       "            (SelfAttention): T5Attention(\n",
       "              (q): Linear(in_features=64, out_features=384, bias=False)\n",
       "              (k): Linear(in_features=64, out_features=384, bias=False)\n",
       "              (v): Linear(in_features=64, out_features=384, bias=False)\n",
       "              (o): Linear(in_features=384, out_features=64, bias=False)\n",
       "              (relative_attention_bias): Embedding(32, 6)\n",
       "            )\n",
       "            (layer_norm): T5LayerNorm()\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "          )\n",
       "          (1): T5LayerFF(\n",
       "            (DenseReluDense): T5DenseActDense(\n",
       "              (wi): Linear(in_features=64, out_features=256, bias=False)\n",
       "              (wo): Linear(in_features=256, out_features=64, bias=False)\n",
       "              (dropout): Dropout(p=0.1, inplace=False)\n",
       "              (act): ReLU()\n",
       "            )\n",
       "            (layer_norm): T5LayerNorm()\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "          )\n",
       "        )\n",
       "      )\n",
       "    )\n",
       "    (final_layer_norm): T5LayerNorm()\n",
       "    (dropout): Dropout(p=0.1, inplace=False)\n",
       "  )\n",
       "  (decoder): T5Stack(\n",
       "    (embed_tokens): Embedding(32128, 64)\n",
       "    (block): ModuleList(\n",
       "      (0): T5Block(\n",
       "        (layer): ModuleList(\n",
       "          (0): T5LayerSelfAttention(\n",
       "            (SelfAttention): T5Attention(\n",
       "              (q): Linear(in_features=64, out_features=384, bias=False)\n",
       "              (k): Linear(in_features=64, out_features=384, bias=False)\n",
       "              (v): Linear(in_features=64, out_features=384, bias=False)\n",
       "              (o): Linear(in_features=384, out_features=64, bias=False)\n",
       "              (relative_attention_bias): Embedding(32, 6)\n",
       "            )\n",
       "            (layer_norm): T5LayerNorm()\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "          )\n",
       "          (1): T5LayerCrossAttention(\n",
       "            (EncDecAttention): T5Attention(\n",
       "              (q): Linear(in_features=64, out_features=384, bias=False)\n",
       "              (k): Linear(in_features=64, out_features=384, bias=False)\n",
       "              (v): Linear(in_features=64, out_features=384, bias=False)\n",
       "              (o): Linear(in_features=384, out_features=64, bias=False)\n",
       "            )\n",
       "            (layer_norm): T5LayerNorm()\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "          )\n",
       "          (2): T5LayerFF(\n",
       "            (DenseReluDense): T5DenseActDense(\n",
       "              (wi): Linear(in_features=64, out_features=256, bias=False)\n",
       "              (wo): Linear(in_features=256, out_features=64, bias=False)\n",
       "              (dropout): Dropout(p=0.1, inplace=False)\n",
       "              (act): ReLU()\n",
       "            )\n",
       "            (layer_norm): T5LayerNorm()\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "          )\n",
       "        )\n",
       "      )\n",
       "    )\n",
       "    (final_layer_norm): T5LayerNorm()\n",
       "    (dropout): Dropout(p=0.1, inplace=False)\n",
       "  )\n",
       ")"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model = T5Model(config)\n",
    "load_tf_weights_in_t5(model, config, 't5-3x-super-tiny-v3/model.ckpt-1000000')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "('config.json', 'pytorch_model.bin')"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from transformers import CONFIG_NAME, WEIGHTS_NAME\n",
    "CONFIG_NAME, WEIGHTS_NAME"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch\n",
    "\n",
    "torch.save(model.state_dict(), out + '/' + WEIGHTS_NAME)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import T5Config, T5Model, T5Tokenizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !wget https://f000.backblazeb2.com/file/malaya-model/bpe/sp10m.cased.ms-en-4k.model\n",
    "# !wget https://f000.backblazeb2.com/file/malaya-model/bpe/sp10m.cased.ms-en.model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "('t5-3x-super-tiny/tokenizer_config.json',\n",
       " 't5-3x-super-tiny/special_tokens_map.json',\n",
       " 't5-3x-super-tiny/spiece.model',\n",
       " 't5-3x-super-tiny/added_tokens.json')"
      ]
     },
     "execution_count": 52,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer = T5Tokenizer('sp10m.cased.ms-en.model')\n",
    "tokenizer.save_pretrained(out)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "('t5-3x-super-tiny/tokenizer_config.json',\n",
       " 't5-3x-super-tiny/special_tokens_map.json',\n",
       " 't5-3x-super-tiny/spiece.model',\n",
       " 't5-3x-super-tiny/added_tokens.json')"
      ]
     },
     "execution_count": 54,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer = T5Tokenizer.from_pretrained(f'./{out}', lower = False)\n",
    "tokenizer.save_pretrained(out)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "T5Config {\n",
       "  \"d_ff\": 256,\n",
       "  \"d_kv\": 64,\n",
       "  \"d_model\": 64,\n",
       "  \"decoder_start_token_id\": 0,\n",
       "  \"dense_act_fn\": \"relu\",\n",
       "  \"dropout_rate\": 0.1,\n",
       "  \"eos_token_id\": 1,\n",
       "  \"feed_forward_proj\": \"relu\",\n",
       "  \"initializer_factor\": 1.0,\n",
       "  \"inputs_length\": 1024,\n",
       "  \"is_encoder_decoder\": true,\n",
       "  \"is_gated_act\": false,\n",
       "  \"layer_norm_epsilon\": 1e-06,\n",
       "  \"model_type\": \"t5\",\n",
       "  \"n_positions\": 1024,\n",
       "  \"num_decoder_layers\": 1,\n",
       "  \"num_heads\": 6,\n",
       "  \"num_layers\": 1,\n",
       "  \"pad_token_id\": 0,\n",
       "  \"relative_attention_max_distance\": 128,\n",
       "  \"relative_attention_num_buckets\": 32,\n",
       "  \"transformers_version\": \"4.21.2\",\n",
       "  \"use_cache\": true,\n",
       "  \"vocab_size\": 32128\n",
       "}"
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "config = T5Config.from_pretrained(f'./{out}')\n",
    "config"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "T5Model(\n",
       "  (shared): Embedding(32128, 64)\n",
       "  (encoder): T5Stack(\n",
       "    (embed_tokens): Embedding(32128, 64)\n",
       "    (block): ModuleList(\n",
       "      (0): T5Block(\n",
       "        (layer): ModuleList(\n",
       "          (0): T5LayerSelfAttention(\n",
       "            (SelfAttention): T5Attention(\n",
       "              (q): Linear(in_features=64, out_features=384, bias=False)\n",
       "              (k): Linear(in_features=64, out_features=384, bias=False)\n",
       "              (v): Linear(in_features=64, out_features=384, bias=False)\n",
       "              (o): Linear(in_features=384, out_features=64, bias=False)\n",
       "              (relative_attention_bias): Embedding(32, 6)\n",
       "            )\n",
       "            (layer_norm): T5LayerNorm()\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "          )\n",
       "          (1): T5LayerFF(\n",
       "            (DenseReluDense): T5DenseActDense(\n",
       "              (wi): Linear(in_features=64, out_features=256, bias=False)\n",
       "              (wo): Linear(in_features=256, out_features=64, bias=False)\n",
       "              (dropout): Dropout(p=0.1, inplace=False)\n",
       "              (act): ReLU()\n",
       "            )\n",
       "            (layer_norm): T5LayerNorm()\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "          )\n",
       "        )\n",
       "      )\n",
       "    )\n",
       "    (final_layer_norm): T5LayerNorm()\n",
       "    (dropout): Dropout(p=0.1, inplace=False)\n",
       "  )\n",
       "  (decoder): T5Stack(\n",
       "    (embed_tokens): Embedding(32128, 64)\n",
       "    (block): ModuleList(\n",
       "      (0): T5Block(\n",
       "        (layer): ModuleList(\n",
       "          (0): T5LayerSelfAttention(\n",
       "            (SelfAttention): T5Attention(\n",
       "              (q): Linear(in_features=64, out_features=384, bias=False)\n",
       "              (k): Linear(in_features=64, out_features=384, bias=False)\n",
       "              (v): Linear(in_features=64, out_features=384, bias=False)\n",
       "              (o): Linear(in_features=384, out_features=64, bias=False)\n",
       "              (relative_attention_bias): Embedding(32, 6)\n",
       "            )\n",
       "            (layer_norm): T5LayerNorm()\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "          )\n",
       "          (1): T5LayerCrossAttention(\n",
       "            (EncDecAttention): T5Attention(\n",
       "              (q): Linear(in_features=64, out_features=384, bias=False)\n",
       "              (k): Linear(in_features=64, out_features=384, bias=False)\n",
       "              (v): Linear(in_features=64, out_features=384, bias=False)\n",
       "              (o): Linear(in_features=384, out_features=64, bias=False)\n",
       "            )\n",
       "            (layer_norm): T5LayerNorm()\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "          )\n",
       "          (2): T5LayerFF(\n",
       "            (DenseReluDense): T5DenseActDense(\n",
       "              (wi): Linear(in_features=64, out_features=256, bias=False)\n",
       "              (wo): Linear(in_features=256, out_features=64, bias=False)\n",
       "              (dropout): Dropout(p=0.1, inplace=False)\n",
       "              (act): ReLU()\n",
       "            )\n",
       "            (layer_norm): T5LayerNorm()\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "          )\n",
       "        )\n",
       "      )\n",
       "    )\n",
       "    (final_layer_norm): T5LayerNorm()\n",
       "    (dropout): Dropout(p=0.1, inplace=False)\n",
       "  )\n",
       ")"
      ]
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
     "# load from the checkpoint directory (config.json + pytorch_model.bin were saved there);\n",
     "# passing the raw .bin path is a legacy form of from_pretrained\n",
     "model = T5Model.from_pretrained(f'./{out}', config = config)\n",
    "model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import T5Tokenizer, T5ForConditionalGeneration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "dbeae47395e544d6bdedbf13764137b6",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading spiece.model:   0%|          | 0.00/784k [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "b1b21b64c3fc4df8aefbe95dbe813a7e",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading special_tokens_map.json:   0%|          | 0.00/2.15k [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "4da837ce942d48359418c384f57ad3a3",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading tokenizer_config.json:   0%|          | 0.00/2.51k [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "b7a29fd98cb14759bfcb9242c0f6ecdd",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading config.json:   0%|          | 0.00/740 [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "13b3a192900f4eca819657bdc0356f8a",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading pytorch_model.bin:   0%|          | 0.00/9.23M [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/husein/.local/lib/python3.8/site-packages/transformers/generation_utils.py:1202: UserWarning: Neither `max_length` nor `max_new_tokens` have been set, `max_length` will default to 20 (`self.config.max_length`). Controlling `max_length` via the config is deprecated and `max_length` will be removed from the config in v5 of Transformers -- we recommend using `max_new_tokens` to control the maximum length of the generation.\n",
      "  warnings.warn(\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "'<pad> David Bowie</s>'"
      ]
     },
     "execution_count": 47,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer = T5Tokenizer.from_pretrained(f'./{out}')\n",
    "model = T5ForConditionalGeneration.from_pretrained(f'./{out}')\n",
    "input_ids = tokenizer.encode('soalan: siapakah perdana menteri malaysia?', return_tensors = 'pt')\n",
     "# set max_new_tokens explicitly: relying on config.max_length is deprecated\n",
     "# (see the UserWarning transformers emits otherwise)\n",
     "outputs = model.generate(input_ids, max_new_tokens = 20)\n",
    "tokenizer.decode(outputs[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 7.71 ms, sys: 0 ns, total: 7.71 ms\n",
      "Wall time: 4.57 ms\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "'<pad> gunung besar</s>'"
      ]
     },
     "execution_count": 55,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "input_ids = tokenizer.encode('soalan: gunung apa paling tinggi?', return_tensors = 'pt')\n",
     "outputs = model.generate(input_ids, max_new_tokens = 20)\n",
    "tokenizer.decode(outputs[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/husein/.local/lib/python3.8/site-packages/huggingface_hub/utils/_deprecation.py:38: FutureWarning: Deprecated positional argument(s) used in 'create_repo': pass token='t5-3x-super-tiny-standard-bahasa-cased' as keyword args. From version 0.12 passing these as positional arguments will result in an error,\n",
      "  warnings.warn(\n",
      "/home/husein/.local/lib/python3.8/site-packages/huggingface_hub/hf_api.py:102: FutureWarning: `name` and `organization` input arguments are deprecated and will be removed in v0.10. Pass `repo_id` instead.\n",
      "  warnings.warn(\n",
      "/home/husein/.local/lib/python3.8/site-packages/huggingface_hub/hf_api.py:681: FutureWarning: `create_repo` now takes `token` as an optional positional argument. Be sure to adapt your code!\n",
      "  warnings.warn(\n",
      "Cloning https://huggingface.co/mesolitica/t5-3x-super-tiny-standard-bahasa-cased into local empty directory.\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "7519e5b5bf9f408081f70d5998cd33d6",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Upload file pytorch_model.bin:   0%|          | 32.0k/9.23M [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "remote: Scanning LFS files for validity, may be slow...        \n",
      "remote: LFS file scan complete.        \n",
      "To https://huggingface.co/mesolitica/t5-3x-super-tiny-standard-bahasa-cased\n",
      "   57c40f4..1c779d6  main -> main\n",
      "\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "'https://huggingface.co/mesolitica/t5-3x-super-tiny-standard-bahasa-cased/commit/1c779d6833b9adfc951332aabbd5b3f7a36d16ec'"
      ]
     },
     "execution_count": 44,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
     "# use a single repo_id ('org/name'): the separate `organization` kwarg is deprecated\n",
     "# and, as the stderr above shows, ends up misrouted into create_repo's `token` argument\n",
     "model.push_to_hub('mesolitica/t5-3x-super-tiny-standard-bahasa-cased')\n",
     "tokenizer.push_to_hub('mesolitica/t5-3x-super-tiny-standard-bahasa-cased')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "!rm -rf t5-3x-super-tiny-v3 t5-3x-super-tiny-standard-bahasa-cased t5-3x-super-tiny"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
